Kapitel 6.2: Zentralität – Vektoren Distanzen¶
Das Notebook ergänzt Kapitel 6.2 'Zentralität'.
Import¶
In [1]:
import pandas as pd
import plotly.express as px
from tqdm.notebook import tqdm
from resources_geschichtslyrik import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from scipy.spatial import distance
from scipy.stats import entropy
In [2]:
meta = pd.read_json(r"../resources/meta.json")
In [3]:
features_used_df = pd.read_csv("../resources/more/vectors/vectordist_features.csv", index_col = [0])
meta_all_features = pd.read_csv("../resources/more/vectors/vectordist.csv", index_col = [0])
features_used = features_used_df['feature'].tolist()
In [4]:
meta_mode_strikt = pd.read_csv("../resources/more/vectors/mode_strikt.csv", index_col = [0])
meta_mode_flexibel = pd.read_csv("../resources/more/vectors/mode_flexibel.csv", index_col = [0])
Korpora¶
In [5]:
meta_anth = (
meta
.query("corpus=='anth'")
.query("1850 <= year <= 1918")
.query("geschichtslyrik == 1")
.drop_duplicates(subset='author_title')
.reset_index(drop = True)
)
In [6]:
modcanon_authors = ['Hofmannsthal, Hugo von', 'Rilke, Rainer Maria', 'George, Stefan', 'Heym, Georg']
meta_modcanon = (
meta
.query("author in @modcanon_authors")
.query("1850 <= year <= 1918")
.query("geschichtslyrik == 1")
.drop_duplicates(subset='author_title')
.reset_index(drop = True)
)
In [7]:
muench_authors = ['Münchhausen, Börries von', 'Miegel, Agnes', 'Strauß und Torney, Lulu von']
meta_muench = (
meta
.query("author in @muench_authors")
.query("1850 <= year <= 1918")
.query("geschichtslyrik == 1")
.drop_duplicates(subset='author_title')
.reset_index(drop = True)
)
In [8]:
meta_all = pd.concat([meta_anth, meta_modcanon, meta_muench])
meta_all = meta_all.drop_duplicates(subset = 'id')
meta_all = meta_all.reset_index(drop = True)
meta_all['korpus_anth'] = [True if x in list(meta_anth['author_title']) else False for x in meta_all['author_title']]
meta_all['korpus_modcanon'] = [True if x in modcanon_authors else False for x in meta_all['author']]
meta_all['korpus_muench'] = [True if x in muench_authors else False for x in meta_all['author']]
meta_all.shape[0]
Out[8]:
2063
In [9]:
print(meta_all.shape[0])
print(meta_all_features.shape[0])
print(meta_mode_strikt.shape[0])
print(meta_mode_flexibel.shape[0])
2063 2063 2063 2063
In [10]:
meta_all = meta_all.merge(meta_all_features, on = 'id')
meta_all = meta_all.merge(meta_mode_strikt, on = 'id')
meta_all = meta_all.merge(meta_mode_flexibel, on = 'id')
meta_all.shape[0]
Out[10]:
2063
Feature-Übersicht¶
In [11]:
features_used_df
Out[11]:
| feature | encoding | weight | encoding_orig | |
|---|---|---|---|---|
| 0 | vectortyp_geschichtslyrik | ordinal | 1.00 | NaN |
| 1 | vectortyp_empirisch | bin | 1.00 | NaN |
| 2 | vectortyp_theoretisch | bin | 1.00 | NaN |
| 3 | vectortyp_sprechinstanz_markiert | bin | 1.00 | NaN |
| 4 | vectortyp_konkretheit | ordinal | 1.00 | NaN |
| ... | ... | ... | ... | ... |
| 1137 | vectortyp_geschichtsauffassung_bewertung_ambiv... | bin | 0.20 | nominal |
| 1138 | vectortyp_verhaeltnis_wissen_ergänzend | bin | 0.25 | nominal_multi |
| 1139 | vectortyp_verhaeltnis_wissen_übereinstimmend | bin | 0.25 | nominal_multi |
| 1140 | vectortyp_verhaeltnis_wissen_abweichend_überna... | bin | 0.25 | nominal_multi |
| 1141 | vectortyp_verhaeltnis_wissen_abweichend_natürlich | bin | 0.25 | nominal_multi |
1142 rows × 4 columns
In [12]:
meta_all[[
'vectortyp_geschichtslyrik',
'vectortyp_zeitebenen',
'vectortyp_beginn',
'vectortyp_entity_simple_1',
'vectortyp_entity_bewertung_1_1',
'vectortyp_stoffgebiete_dim_1',
]].sample(n=10)
Out[12]:
| vectortyp_geschichtslyrik | vectortyp_zeitebenen | vectortyp_beginn | vectortyp_entity_simple_1 | vectortyp_entity_bewertung_1_1 | vectortyp_stoffgebiete_dim_1 | |
|---|---|---|---|---|---|---|
| 696 | 1.0 | 2.0 | 1860.0 | 0 | 0 | -0.853739 |
| 107 | 1.0 | 1.0 | 1757.0 | 1 | 1 | -0.987243 |
| 411 | 1.0 | 2.0 | 1631.0 | 1 | 0 | -0.987243 |
| 1607 | 1.0 | 2.0 | 1627.0 | 1 | 1 | -1.761696 |
| 147 | 1.0 | 2.0 | 1851.0 | 1 | 1 | -1.618742 |
| 997 | 1.0 | 2.0 | 1189.0 | 0 | 0 | -0.987243 |
| 1723 | 1.0 | 4.0 | 1229.0 | 2 | 0 | -1.277816 |
| 1003 | 1.0 | 1.0 | 1190.0 | 1 | 1 | 2.394565 |
| 2007 | 1.0 | 2.0 | 1150.0 | 0 | 0 | -1.903517 |
| 709 | 1.0 | 2.0 | 1871.0 | 1 | 1 | -0.987243 |
In [13]:
round(features_used_df['weight'].sum(), 5)
Out[13]:
40.0
Weitere Gewichtung der Features¶
- weights = Faktor, mit dem die jeweilige Feature-Gruppe gewichtet wird, z. B. 2 = doppelt so stark
In [14]:
weights = {
'gattung' : 1,
'stoffgebiete_dim' : 1,
'beginn' : 1,
'ende' : 1,
}
In [15]:
for weight in weights:
this_index = features_used_df['feature'].str.contains(weight)
current_values = features_used_df.loc[this_index, 'weight']
if current_values.sum() != weights[weight]:
features_used_df.loc[this_index, 'weight_orig'] = 1/sum(this_index)
features_used_df.loc[this_index, 'weight'] = weights[weight]/sum(this_index)
Skalierung¶
In [16]:
# scaler = StandardScaler()
# scaler = RobustScaler()
scaler = MinMaxScaler()
In [17]:
meta_all[features_used] = scaler.fit_transform(meta_all[features_used])
In [18]:
meta_all[[
'vectortyp_geschichtslyrik',
'vectortyp_zeitebenen',
'vectortyp_beginn',
'vectortyp_entity_simple_1',
'vectortyp_entity_bewertung_1_1',
'vectortyp_stoffgebiete_dim_1',
]].sample(n=10)
Out[18]:
| vectortyp_geschichtslyrik | vectortyp_zeitebenen | vectortyp_beginn | vectortyp_entity_simple_1 | vectortyp_entity_bewertung_1_1 | vectortyp_stoffgebiete_dim_1 | |
|---|---|---|---|---|---|---|
| 1816 | 0.0 | 0.0 | 0.984538 | 0.00 | 0.00 | 0.352816 |
| 1887 | 0.0 | 0.0 | 0.651734 | 0.00 | 0.00 | 0.352816 |
| 117 | 0.0 | 0.2 | 0.990462 | 0.00 | 0.00 | 0.352816 |
| 145 | 0.0 | 0.4 | 0.982225 | 0.25 | 0.25 | 0.384847 |
| 1720 | 0.0 | 0.4 | 0.863150 | 0.50 | 0.50 | 0.278450 |
| 1662 | 0.0 | 0.4 | 0.919220 | 0.50 | 0.25 | 0.373682 |
| 530 | 0.0 | 0.0 | 0.981358 | 0.25 | 0.25 | 0.317260 |
| 605 | 0.0 | 0.2 | 0.913006 | 0.50 | 0.25 | 0.401228 |
| 1247 | 0.0 | 0.2 | 0.714884 | 0.25 | 0.00 | 0.352816 |
| 812 | 0.0 | 0.2 | 0.943642 | 0.50 | 0.25 | 0.805181 |
Texte mit fehlenden Daten ignorieren¶
In [19]:
for element in meta_all.iloc:
missing = element[features_used].isnull()
if sum(missing) > 0:
print(element.id)
print(element[features_used][missing.values])
print("\n")
In [20]:
meta_all = meta_all.dropna(subset=features_used).copy()
meta_all = meta_all.reset_index(drop = True)
In [21]:
for element in meta_all.iloc:
missing = element[features_used].isnull()
if sum(missing) > 0:
print(element.id)
print(element[features_used][missing.values])
print("\n")
Zentroid und Abstand zum Zentroid berechnen¶
In [22]:
centroid = meta_all.query("korpus_anth")[features_used].mean()
In [23]:
centroid
Out[23]:
vectortyp_geschichtslyrik 0.000000
vectortyp_empirisch 0.997297
vectortyp_theoretisch 0.030270
vectortyp_sprechinstanz_markiert 0.219189
vectortyp_konkretheit 0.902162
...
vectortyp_geschichtsauffassung_bewertung_ambivalent 0.001622
vectortyp_verhaeltnis_wissen_ergänzend 0.747568
vectortyp_verhaeltnis_wissen_übereinstimmend 0.142703
vectortyp_verhaeltnis_wissen_abweichend_übernatürlich 0.107027
vectortyp_verhaeltnis_wissen_abweichend_natürlich 0.003243
Length: 1142, dtype: float64
In [24]:
for i, element in enumerate(meta_all.iloc):
meta_all.at[i, 'dist_centroid_manhattan'] = distance.cityblock(element[features_used], centroid, w = features_used_df['weight'])
meta_all.at[i, 'dist_centroid_euclidean'] = distance.euclidean(element[features_used], centroid, w = features_used_df['weight'])
meta_all.at[i, 'dist_centroid_cosine'] = distance.cosine(element[features_used], centroid, w = features_used_df['weight'])
In [25]:
meta_all['dist_centroid_alldistances'] = (
scaler.fit_transform(meta_all[['dist_centroid_manhattan']]) +
scaler.fit_transform(meta_all[['dist_centroid_euclidean']]) +
scaler.fit_transform(meta_all[['dist_centroid_cosine']])
) / 3
In [26]:
for i, element in enumerate(meta_all.iloc):
meta_all.at[i, 'dist_centroid_manhattan_unweighted'] = distance.cityblock(element[features_used], centroid)
meta_all.at[i, 'dist_centroid_euclidean_unweighted'] = distance.euclidean(element[features_used], centroid)
meta_all.at[i, 'dist_centroid_cosine_unweighted'] = distance.cosine(element[features_used], centroid)
In [27]:
meta_all['dist_centroid_alldistances_unweighted'] = (
scaler.fit_transform(meta_all[['dist_centroid_manhattan_unweighted']]) +
scaler.fit_transform(meta_all[['dist_centroid_euclidean_unweighted']]) +
scaler.fit_transform(meta_all[['dist_centroid_cosine_unweighted']])
) / 3
Tests¶
In [28]:
meta_all.drop_duplicates(subset="author_title")[[
"author", "title", "year", "dist_centroid_euclidean",
]].sort_values(by = ["dist_centroid_euclidean", "author"], ascending = True).head(10)
Out[28]:
| author | title | year | dist_centroid_euclidean | |
|---|---|---|---|---|
| 666 | Gruppe, Otto Friedrich | Karl am Meere | 1852.0 | 1.115235 |
| 1630 | Müller von Königswinter, Wolfgang | Das Zepter Rudolfs von Habsburg | 1852.0 | 1.117486 |
| 1987 | Münchhausen, Börries von | Heerpauken | 1914.0 | 1.120395 |
| 1740 | Richter, Paul | Brusehawer | 1908.0 | 1.125591 |
| 1094 | Meyer, Conrad Ferdinand | Die Schweizer des Herrn von Tremouille | 1875.0 | 1.126415 |
| 1488 | Schrutz, Demetrius | Der Langobardentrunk | 1913.0 | 1.129181 |
| 1635 | Groth, Klaus | Graf Rudolf von Böklenburg | 1853.0 | 1.131610 |
| 242 | Zille, Moritz Alexander | Bonifacius Tod | 1851.0 | 1.133179 |
| 292 | Brunold, Friedrich | König Christian I. von Dänemark und Henning Wulf | 1859.0 | 1.136554 |
| 1396 | Lahmann, Johann Friedrich | Heinrich IV. an der Elster | 1890.0 | 1.137683 |
In [29]:
meta_all[[
'mode_score_strikt',
'mode_score_flexibel',
'dist_centroid_manhattan_unweighted',
'dist_centroid_euclidean_unweighted',
'dist_centroid_cosine_unweighted',
'dist_centroid_alldistances_unweighted',
'dist_centroid_manhattan',
'dist_centroid_euclidean',
'dist_centroid_cosine',
'dist_centroid_alldistances',
]].corr()
Out[29]:
| mode_score_strikt | mode_score_flexibel | dist_centroid_manhattan_unweighted | dist_centroid_euclidean_unweighted | dist_centroid_cosine_unweighted | dist_centroid_alldistances_unweighted | dist_centroid_manhattan | dist_centroid_euclidean | dist_centroid_cosine | dist_centroid_alldistances | |
|---|---|---|---|---|---|---|---|---|---|---|
| mode_score_strikt | 1.000000 | 0.952823 | -0.900802 | -0.914699 | -0.888176 | -0.914241 | -0.908581 | -0.903217 | -0.882121 | -0.911031 |
| mode_score_flexibel | 0.952823 | 1.000000 | -0.887655 | -0.897019 | -0.896656 | -0.906832 | -0.877581 | -0.871240 | -0.878015 | -0.886252 |
| dist_centroid_manhattan_unweighted | -0.900802 | -0.887655 | 1.000000 | 0.979699 | 0.940538 | 0.986774 | 0.880244 | 0.869071 | 0.866409 | 0.883292 |
| dist_centroid_euclidean_unweighted | -0.914699 | -0.897019 | 0.979699 | 1.000000 | 0.952941 | 0.991597 | 0.896486 | 0.901334 | 0.874381 | 0.904018 |
| dist_centroid_cosine_unweighted | -0.888176 | -0.896656 | 0.940538 | 0.952941 | 1.000000 | 0.979029 | 0.860968 | 0.856909 | 0.918110 | 0.885118 |
| dist_centroid_alldistances_unweighted | -0.914241 | -0.906832 | 0.986774 | 0.991597 | 0.979029 | 1.000000 | 0.891894 | 0.888581 | 0.899602 | 0.903857 |
| dist_centroid_manhattan | -0.908581 | -0.877581 | 0.880244 | 0.896486 | 0.860968 | 0.891894 | 1.000000 | 0.989582 | 0.947129 | 0.994697 |
| dist_centroid_euclidean | -0.903217 | -0.871240 | 0.869071 | 0.901334 | 0.856909 | 0.888581 | 0.989582 | 1.000000 | 0.938454 | 0.992968 |
| dist_centroid_cosine | -0.882121 | -0.878015 | 0.866409 | 0.874381 | 0.918110 | 0.899602 | 0.947129 | 0.938454 | 1.000000 | 0.969403 |
| dist_centroid_alldistances | -0.911031 | -0.886252 | 0.883292 | 0.904018 | 0.885118 | 0.903857 | 0.994697 | 0.992968 | 0.969403 | 1.000000 |
In [30]:
px.box(
meta_all,
x = 'mode_score_flexibel',
y = 'dist_centroid_manhattan',
points = 'all',
hover_data = ['id', 'author', 'title',]
)
Distanzen zwischen allen Texten berechnen und Distanzmatrix generieren¶
In [31]:
this_vectors = meta_all[features_used].to_numpy()
this_weights = features_used_df['weight'].values
In [32]:
dm_manhattan = distance.cdist(this_vectors, this_vectors, metric='cityblock', w=this_weights)
dm_manhattan = pd.DataFrame(dm_manhattan, index=meta_all['id'].tolist(), columns=meta_all['id'].tolist())
In [33]:
dm_euclidean = distance.cdist(this_vectors, this_vectors, metric='euclidean', w=this_weights)
dm_euclidean = pd.DataFrame(dm_euclidean, index=meta_all['id'].tolist(), columns=meta_all['id'].tolist())
In [34]:
dm_cosine = distance.cdist(this_vectors, this_vectors, metric='cosine', w=this_weights)
dm_cosine = pd.DataFrame(dm_cosine, index=meta_all['id'].tolist(), columns=meta_all['id'].tolist())
In [35]:
dm_alldistances = (
dm_manhattan/dm_manhattan.max().max() + # MinMaxScaling
dm_euclidean/dm_euclidean.max().max() +
dm_cosine/dm_cosine.max().max()
) / 3
dm_alldistances = pd.DataFrame(dm_alldistances)
dm_alldistances.index = dm_manhattan.index
dm_alldistances.columns = dm_manhattan.columns
In [36]:
dm_manhattan_unweighted = distance.cdist(this_vectors, this_vectors, metric='cityblock')
dm_manhattan_unweighted = pd.DataFrame(dm_manhattan_unweighted, index=meta_all['id'].tolist(), columns=meta_all['id'].tolist())
In [37]:
dm_euclidean_unweighted = distance.cdist(this_vectors, this_vectors, metric='euclidean')
dm_euclidean_unweighted = pd.DataFrame(dm_euclidean_unweighted, index=meta_all['id'].tolist(), columns=meta_all['id'].tolist())
In [38]:
dm_cosine_unweighted = distance.cdist(this_vectors, this_vectors, metric='cosine')
dm_cosine_unweighted = pd.DataFrame(dm_cosine_unweighted, index=meta_all['id'].tolist(), columns=meta_all['id'].tolist())
In [39]:
dm_alldistances_unweighted = (
dm_manhattan_unweighted/dm_manhattan_unweighted.max().max() + # MinMaxScaling
dm_euclidean_unweighted/dm_euclidean_unweighted.max().max() +
dm_cosine_unweighted/dm_cosine_unweighted.max().max()
) / 3
dm_alldistances_unweighted = pd.DataFrame(dm_alldistances_unweighted)
dm_alldistances_unweighted.index = dm_manhattan_unweighted.index
dm_alldistances_unweighted.columns = dm_manhattan_unweighted.columns
In [40]:
# Filter indices for rows in `meta_anth_ids`
meta_anth_ids = meta_anth['id'].tolist()
meta_anth_indices = meta_all[meta_all['id'].isin(meta_anth_ids)].index
# Compute mean distances for each metric in a vectorized manner
meta_all['dist_mean_manhattan'] = dm_manhattan[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_euclidean'] = dm_euclidean[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_cosine'] = dm_cosine[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_alldistances'] = dm_alldistances[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_manhattan_unweighted'] = dm_manhattan_unweighted[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_euclidean_unweighted'] = dm_euclidean_unweighted[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_cosine_unweighted'] = dm_cosine_unweighted[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_alldistances_unweighted'] = dm_alldistances_unweighted[meta_anth_ids].mean(axis=1).values
Tests¶
In [41]:
meta_all.sort_values(by = "dist_mean_euclidean", ascending = True)[[
'author', 'title', 'dist_mean_euclidean',
]].head(10)
Out[41]:
| author | title | dist_mean_euclidean | |
|---|---|---|---|
| 666 | Gruppe, Otto Friedrich | Karl am Meere | 1.998467 |
| 1630 | Müller von Königswinter, Wolfgang | Das Zepter Rudolfs von Habsburg | 1.998481 |
| 1094 | Meyer, Conrad Ferdinand | Die Schweizer des Herrn von Tremouille | 2.002944 |
| 1488 | Schrutz, Demetrius | Der Langobardentrunk | 2.006236 |
| 1635 | Groth, Klaus | Graf Rudolf von Böklenburg | 2.006753 |
| 292 | Brunold, Friedrich | König Christian I. von Dänemark und Henning Wulf | 2.008788 |
| 242 | Zille, Moritz Alexander | Bonifacius Tod | 2.010288 |
| 1396 | Lahmann, Johann Friedrich | Heinrich IV. an der Elster | 2.010472 |
| 27 | Grimm, Herman | Die Tochter des Langobardenkönigs | 2.010544 |
| 1629 | Halm, Friedrich | Friedrich mit der gebissenen Wange | 2.012063 |
In [42]:
meta_all[[
'mode_score_strikt', 'mode_score_flexibel',
'dist_centroid_manhattan_unweighted', 'dist_centroid_euclidean_unweighted',
'dist_centroid_cosine_unweighted', 'dist_centroid_alldistances_unweighted',
'dist_centroid_manhattan', 'dist_centroid_euclidean', 'dist_centroid_cosine', 'dist_centroid_alldistances',
'dist_mean_manhattan', 'dist_mean_euclidean', 'dist_mean_cosine', 'dist_mean_alldistances',
'dist_mean_manhattan_unweighted', 'dist_mean_euclidean_unweighted', 'dist_mean_cosine_unweighted',
'dist_mean_alldistances_unweighted'
]].corr()
Out[42]:
| mode_score_strikt | mode_score_flexibel | dist_centroid_manhattan_unweighted | dist_centroid_euclidean_unweighted | dist_centroid_cosine_unweighted | dist_centroid_alldistances_unweighted | dist_centroid_manhattan | dist_centroid_euclidean | dist_centroid_cosine | dist_centroid_alldistances | dist_mean_manhattan | dist_mean_euclidean | dist_mean_cosine | dist_mean_alldistances | dist_mean_manhattan_unweighted | dist_mean_euclidean_unweighted | dist_mean_cosine_unweighted | dist_mean_alldistances_unweighted | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mode_score_strikt | 1.000000 | 0.952823 | -0.900802 | -0.914699 | -0.888176 | -0.914241 | -0.908581 | -0.903217 | -0.882121 | -0.911031 | -0.908174 | -0.906328 | -0.887145 | -0.911418 | -0.904079 | -0.919082 | -0.891435 | -0.915807 |
| mode_score_flexibel | 0.952823 | 1.000000 | -0.887655 | -0.897019 | -0.896656 | -0.906832 | -0.877581 | -0.871240 | -0.878015 | -0.886252 | -0.877557 | -0.873846 | -0.880373 | -0.887011 | -0.887584 | -0.901716 | -0.898782 | -0.907626 |
| dist_centroid_manhattan_unweighted | -0.900802 | -0.887655 | 1.000000 | 0.979699 | 0.940538 | 0.986774 | 0.880244 | 0.869071 | 0.866409 | 0.883292 | 0.881619 | 0.870721 | 0.869441 | 0.884200 | 0.998595 | 0.980713 | 0.942623 | 0.986269 |
| dist_centroid_euclidean_unweighted | -0.914699 | -0.897019 | 0.979699 | 1.000000 | 0.952941 | 0.991597 | 0.896486 | 0.901334 | 0.874381 | 0.904018 | 0.899069 | 0.901863 | 0.878798 | 0.903991 | 0.983311 | 0.999396 | 0.955470 | 0.990811 |
| dist_centroid_cosine_unweighted | -0.888176 | -0.896656 | 0.940538 | 0.952941 | 1.000000 | 0.979029 | 0.860968 | 0.856909 | 0.918110 | 0.885118 | 0.862470 | 0.857055 | 0.916545 | 0.886616 | 0.942132 | 0.954520 | 0.999925 | 0.979573 |
| dist_centroid_alldistances_unweighted | -0.914241 | -0.906832 | 0.986774 | 0.991597 | 0.979029 | 1.000000 | 0.891894 | 0.888581 | 0.899602 | 0.903857 | 0.893752 | 0.889345 | 0.901565 | 0.904659 | 0.988128 | 0.992260 | 0.980548 | 0.999754 |
| dist_centroid_manhattan | -0.908581 | -0.877581 | 0.880244 | 0.896486 | 0.860968 | 0.891894 | 1.000000 | 0.989582 | 0.947129 | 0.994697 | 0.998880 | 0.991083 | 0.954510 | 0.994123 | 0.880928 | 0.901617 | 0.865158 | 0.892813 |
| dist_centroid_euclidean | -0.903217 | -0.871240 | 0.869071 | 0.901334 | 0.856909 | 0.888581 | 0.989582 | 1.000000 | 0.938454 | 0.992968 | 0.990802 | 0.999380 | 0.946257 | 0.991381 | 0.871650 | 0.904848 | 0.861245 | 0.888947 |
| dist_centroid_cosine | -0.882121 | -0.878015 | 0.866409 | 0.874381 | 0.918110 | 0.899602 | 0.947129 | 0.938454 | 1.000000 | 0.969403 | 0.948296 | 0.938923 | 0.999565 | 0.971236 | 0.866761 | 0.879664 | 0.919263 | 0.901389 |
| dist_centroid_alldistances | -0.911031 | -0.886252 | 0.883292 | 0.904018 | 0.885118 | 0.903857 | 0.994697 | 0.992968 | 0.969403 | 1.000000 | 0.995098 | 0.993368 | 0.975010 | 0.999639 | 0.884659 | 0.908604 | 0.888628 | 0.904792 |
| dist_mean_manhattan | -0.908174 | -0.877557 | 0.881619 | 0.899069 | 0.862470 | 0.893752 | 0.998880 | 0.990802 | 0.948296 | 0.995098 | 1.000000 | 0.992528 | 0.955772 | 0.995407 | 0.883703 | 0.904222 | 0.866736 | 0.895136 |
| dist_mean_euclidean | -0.906328 | -0.873846 | 0.870721 | 0.901863 | 0.857055 | 0.889345 | 0.991083 | 0.999380 | 0.938923 | 0.993368 | 0.992528 | 1.000000 | 0.946930 | 0.992440 | 0.873482 | 0.906108 | 0.861505 | 0.890062 |
| dist_mean_cosine | -0.887145 | -0.880373 | 0.869441 | 0.878798 | 0.916545 | 0.901565 | 0.954510 | 0.946257 | 0.999565 | 0.975010 | 0.955772 | 0.946930 | 1.000000 | 0.976854 | 0.870009 | 0.884124 | 0.918031 | 0.903375 |
| dist_mean_alldistances | -0.911418 | -0.887011 | 0.884200 | 0.903991 | 0.886616 | 0.904659 | 0.994123 | 0.991381 | 0.971236 | 0.999639 | 0.995407 | 0.992440 | 0.976854 | 1.000000 | 0.886078 | 0.908950 | 0.890155 | 0.905964 |
| dist_mean_manhattan_unweighted | -0.904079 | -0.887584 | 0.998595 | 0.983311 | 0.942132 | 0.988128 | 0.880928 | 0.871650 | 0.866761 | 0.884659 | 0.883703 | 0.873482 | 0.870009 | 0.886078 | 1.000000 | 0.984388 | 0.944320 | 0.988446 |
| dist_mean_euclidean_unweighted | -0.919082 | -0.901716 | 0.980713 | 0.999396 | 0.954520 | 0.992260 | 0.901617 | 0.904848 | 0.879664 | 0.908604 | 0.904222 | 0.906108 | 0.884124 | 0.908950 | 0.984388 | 1.000000 | 0.957110 | 0.991975 |
| dist_mean_cosine_unweighted | -0.891435 | -0.898782 | 0.942623 | 0.955470 | 0.999925 | 0.980548 | 0.865158 | 0.861245 | 0.919263 | 0.888628 | 0.866736 | 0.861505 | 0.918031 | 0.890155 | 0.944320 | 0.957110 | 1.000000 | 0.981126 |
| dist_mean_alldistances_unweighted | -0.915807 | -0.907626 | 0.986269 | 0.990811 | 0.979573 | 0.999754 | 0.892813 | 0.888947 | 0.901389 | 0.904792 | 0.895136 | 0.890062 | 0.903375 | 0.905964 | 0.988446 | 0.991975 | 0.981126 | 1.000000 |
Export¶
In [43]:
dm_manhattan.to_csv("../resources/more/vectors/vectordist_dm_manhattan.csv")
dm_euclidean.to_csv("../resources/more/vectors/vectordist_dm_euclidean.csv")
dm_cosine.to_csv("../resources/more/vectors/vectordist_dm_cosine.csv")
dm_alldistances.to_csv("../resources/more/vectors/vectordist_dm_alldistances.csv")
In [44]:
dm_manhattan_unweighted.to_csv("../resources/more/vectors/vectordist_dm_manhattan_unweighted.csv")
dm_euclidean_unweighted.to_csv("../resources/more/vectors/vectordist_dm_euclidean_unweighted.csv")
dm_cosine_unweighted.to_csv("../resources/more/vectors/vectordist_dm_cosine_unweighted.csv")
dm_alldistances_unweighted.to_csv("../resources/more/vectors/vectordist_dm_alldistances_unweighted.csv")
In [45]:
export_meta = meta_all[
['id'] + [
'dist_centroid_manhattan', 'dist_centroid_euclidean', 'dist_centroid_cosine', 'dist_centroid_alldistances',
'dist_centroid_manhattan_unweighted', 'dist_centroid_euclidean_unweighted',
'dist_centroid_cosine_unweighted', 'dist_centroid_alldistances_unweighted',
'dist_mean_manhattan', 'dist_mean_euclidean', 'dist_mean_cosine', 'dist_mean_alldistances',
'dist_mean_manhattan_unweighted', 'dist_mean_euclidean_unweighted',
'dist_mean_cosine_unweighted', 'dist_mean_alldistances_unweighted',
]
]
export_meta.to_csv("../resources/more/vectors/vectordist_dists.csv")
In [46]:
export_meta.head()
Out[46]:
| id | dist_centroid_manhattan | dist_centroid_euclidean | dist_centroid_cosine | dist_centroid_alldistances | dist_centroid_manhattan_unweighted | dist_centroid_euclidean_unweighted | dist_centroid_cosine_unweighted | dist_centroid_alldistances_unweighted | dist_mean_manhattan | dist_mean_euclidean | dist_mean_cosine | dist_mean_alldistances | dist_mean_manhattan_unweighted | dist_mean_euclidean_unweighted | dist_mean_cosine_unweighted | dist_mean_alldistances_unweighted | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1850.Grube.028 | 6.046027 | 1.471513 | 0.112839 | 0.147303 | 22.668557 | 3.031454 | 0.180582 | 0.289637 | 6.231232 | 2.267290 | 0.232499 | 0.375783 | 23.948690 | 4.405008 | 0.330064 | 0.527177 |
| 1 | 1850.Kriebitzsch.001 | 11.063399 | 2.707721 | 0.389565 | 0.713911 | 32.726389 | 4.285712 | 0.364058 | 0.737184 | 11.405351 | 3.231843 | 0.474081 | 0.634671 | 33.671350 | 5.358220 | 0.480409 | 0.708324 |
| 2 | 1850.Kriebitzsch.011 | 7.459599 | 1.886688 | 0.138009 | 0.282977 | 23.414010 | 3.227017 | 0.182180 | 0.322968 | 7.579034 | 2.569597 | 0.257942 | 0.432729 | 24.557230 | 4.544311 | 0.332335 | 0.538962 |
| 3 | 1850.Kriebitzsch.019 | 6.586925 | 1.632293 | 0.109394 | 0.190895 | 24.211065 | 3.304033 | 0.186562 | 0.346566 | 6.665785 | 2.376551 | 0.231034 | 0.391330 | 25.070353 | 4.596473 | 0.335240 | 0.546190 |
| 4 | 1851.Müller/Kletke.018 | 8.133864 | 2.118448 | 0.194718 | 0.382714 | 27.811599 | 3.832658 | 0.243543 | 0.507471 | 8.428754 | 2.756721 | 0.306317 | 0.480857 | 28.988117 | 5.006438 | 0.382807 | 0.614177 |